# Read in the 2002 and 2022 daily PM2.5 exports
Data_2002 <- data.table::fread("ad_viz_plotval_data_2002.csv")
Data_2022 <- data.table::fread("ad_viz_plotval_data_2022.csv")

# Column-wise summary of the 2002 table
summary(Data_2002)
Date Source Site ID POC
Length:15976 Length:15976 Min. :60010007 Min. :1.000
Class :character Class :character 1st Qu.:60290014 1st Qu.:1.000
Mode :character Mode :character Median :60590007 Median :1.000
Mean :60549600 Mean :1.581
3rd Qu.:60731002 3rd Qu.:1.000
Max. :61131003 Max. :6.000
Daily Mean PM2.5 Concentration UNITS DAILY_AQI_VALUE
Min. : 0.00 Length:15976 Min. : 0.00
1st Qu.: 7.00 Class :character 1st Qu.: 29.00
Median : 12.00 Mode :character Median : 50.00
Mean : 16.12 Mean : 53.68
3rd Qu.: 20.50 3rd Qu.: 69.00
Max. :104.30 Max. :176.00
Site Name DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
Length:15976 Min. :1 Min. :100 Min. :88101
Class :character 1st Qu.:1 1st Qu.:100 1st Qu.:88101
Mode :character Median :1 Median :100 Median :88101
Mean :1 Mean :100 Mean :88215
3rd Qu.:1 3rd Qu.:100 3rd Qu.:88502
Max. :1 Max. :100 Max. :88502
AQS_PARAMETER_DESC CBSA_CODE CBSA_NAME STATE_CODE
Length:15976 Min. :12540 Length:15976 Min. :6
Class :character 1st Qu.:23420 Class :character 1st Qu.:6
Mode :character Median :40140 Mode :character Median :6
Mean :33270 Mean :6
3rd Qu.:41740 3rd Qu.:6
Max. :49700 Max. :6
NA's :929
STATE COUNTY_CODE COUNTY SITE_LATITUDE
Length:15976 Min. : 1.00 Length:15976 Min. :32.63
Class :character 1st Qu.: 29.00 Class :character 1st Qu.:34.07
Mode :character Median : 59.00 Mode :character Median :35.36
Mean : 54.78 Mean :36.00
3rd Qu.: 73.00 3rd Qu.:37.77
Max. :113.00 Max. :41.71
SITE_LONGITUDE
Min. :-124.2
1st Qu.:-121.4
Median :-119.1
Mean :-119.4
3rd Qu.:-117.9
Max. :-115.5
# Column-wise summary of the 2022 table
summary(Data_2022)
Date Source Site ID POC
Length:57775 Length:57775 Min. :60010007 Min. : 1.000
Class :character Class :character 1st Qu.:60311004 1st Qu.: 1.000
Mode :character Mode :character Median :60631007 Median : 3.000
Mean :60571692 Mean : 2.531
3rd Qu.:60771003 3rd Qu.: 3.000
Max. :61131003 Max. :21.000
Daily Mean PM2.5 Concentration UNITS DAILY_AQI_VALUE
Min. : -2.200 Length:57775 Min. : 0.00
1st Qu.: 4.200 Class :character 1st Qu.: 18.00
Median : 7.000 Mode :character Median : 29.00
Mean : 8.574 Mean : 32.95
3rd Qu.: 10.900 3rd Qu.: 45.00
Max. :302.500 Max. :353.00
Site Name DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
Length:57775 Min. :1 Min. :100 Min. :88101
Class :character 1st Qu.:1 1st Qu.:100 1st Qu.:88101
Mode :character Median :1 Median :100 Median :88101
Mean :1 Mean :100 Mean :88196
3rd Qu.:1 3rd Qu.:100 3rd Qu.:88101
Max. :1 Max. :100 Max. :88502
AQS_PARAMETER_DESC CBSA_CODE CBSA_NAME STATE_CODE
Length:57775 Min. :12540 Length:57775 Min. :6
Class :character 1st Qu.:31080 Class :character 1st Qu.:6
Mode :character Median :40140 Mode :character Median :6
Mean :35447 Mean :6
3rd Qu.:41860 3rd Qu.:6
Max. :49700 Max. :6
NA's :4761
STATE COUNTY_CODE COUNTY SITE_LATITUDE
Length:57775 Min. : 1.00 Length:57775 Min. :32.58
Class :character 1st Qu.: 31.00 Class :character 1st Qu.:34.14
Mode :character Median : 63.00 Mode :character Median :36.60
Mean : 57.02 Mean :36.37
3rd Qu.: 77.00 3rd Qu.:38.10
Max. :113.00 Max. :41.76
SITE_LONGITUDE
Min. :-124.2
1st Qu.:-121.5
Median :-119.8
Mean :-119.7
3rd Qu.:-118.1
Max. :-115.5
# First rows of the 2002 table
head(Data_2002)
Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
1: 01/05/2002 AQS 60010007 1 25.1 ug/m3 LC
2: 01/06/2002 AQS 60010007 1 31.6 ug/m3 LC
3: 01/08/2002 AQS 60010007 1 21.4 ug/m3 LC
4: 01/11/2002 AQS 60010007 1 25.9 ug/m3 LC
5: 01/14/2002 AQS 60010007 1 34.5 ug/m3 LC
6: 01/17/2002 AQS 60010007 1 41.0 ug/m3 LC
DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1: 78 Livermore 1 100
2: 92 Livermore 1 100
3: 71 Livermore 1 100
4: 80 Livermore 1 100
5: 98 Livermore 1 100
6: 115 Livermore 1 100
AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
1: 88101 PM2.5 - Local Conditions 41860
2: 88101 PM2.5 - Local Conditions 41860
3: 88101 PM2.5 - Local Conditions 41860
4: 88101 PM2.5 - Local Conditions 41860
5: 88101 PM2.5 - Local Conditions 41860
6: 88101 PM2.5 - Local Conditions 41860
CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
SITE_LATITUDE SITE_LONGITUDE
1: 37.68753 -121.7842
2: 37.68753 -121.7842
3: 37.68753 -121.7842
4: 37.68753 -121.7842
5: 37.68753 -121.7842
6: 37.68753 -121.7842
# First rows of the 2022 table
head(Data_2022)
Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
1: 01/01/2022 AQS 60010007 3 12.7 ug/m3 LC
2: 01/02/2022 AQS 60010007 3 13.9 ug/m3 LC
3: 01/03/2022 AQS 60010007 3 7.1 ug/m3 LC
4: 01/04/2022 AQS 60010007 3 3.7 ug/m3 LC
5: 01/05/2022 AQS 60010007 3 4.2 ug/m3 LC
6: 01/06/2022 AQS 60010007 3 3.8 ug/m3 LC
DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1: 52 Livermore 1 100
2: 55 Livermore 1 100
3: 30 Livermore 1 100
4: 15 Livermore 1 100
5: 18 Livermore 1 100
6: 16 Livermore 1 100
AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
1: 88101 PM2.5 - Local Conditions 41860
2: 88101 PM2.5 - Local Conditions 41860
3: 88101 PM2.5 - Local Conditions 41860
4: 88101 PM2.5 - Local Conditions 41860
5: 88101 PM2.5 - Local Conditions 41860
6: 88101 PM2.5 - Local Conditions 41860
CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
SITE_LATITUDE SITE_LONGITUDE
1: 37.68753 -121.7842
2: 37.68753 -121.7842
3: 37.68753 -121.7842
4: 37.68753 -121.7842
5: 37.68753 -121.7842
6: 37.68753 -121.7842
# Last rows of the 2002 table
tail(Data_2002)
Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
1: 12/10/2002 AQS 61131003 1 15 ug/m3 LC
2: 12/13/2002 AQS 61131003 1 15 ug/m3 LC
3: 12/22/2002 AQS 61131003 1 1 ug/m3 LC
4: 12/25/2002 AQS 61131003 1 23 ug/m3 LC
5: 12/28/2002 AQS 61131003 1 5 ug/m3 LC
6: 12/31/2002 AQS 61131003 1 6 ug/m3 LC
DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1: 57 Woodland-Gibson Road 1 100
2: 57 Woodland-Gibson Road 1 100
3: 4 Woodland-Gibson Road 1 100
4: 74 Woodland-Gibson Road 1 100
5: 21 Woodland-Gibson Road 1 100
6: 25 Woodland-Gibson Road 1 100
AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
1: 88101 PM2.5 - Local Conditions 40900
2: 88101 PM2.5 - Local Conditions 40900
3: 88101 PM2.5 - Local Conditions 40900
4: 88101 PM2.5 - Local Conditions 40900
5: 88101 PM2.5 - Local Conditions 40900
6: 88101 PM2.5 - Local Conditions 40900
CBSA_NAME STATE_CODE STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
COUNTY SITE_LATITUDE SITE_LONGITUDE
1: Yolo 38.66121 -121.7327
2: Yolo 38.66121 -121.7327
3: Yolo 38.66121 -121.7327
4: Yolo 38.66121 -121.7327
5: Yolo 38.66121 -121.7327
6: Yolo 38.66121 -121.7327
# NOTE(review): this repeats the tail(Data_2002) call made earlier, so the last
# rows of Data_2022 are never inspected. Presumably tail(Data_2022) was
# intended -- TODO confirm and re-render the output.
tail(Data_2002)
Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
1: 12/10/2002 AQS 61131003 1 15 ug/m3 LC
2: 12/13/2002 AQS 61131003 1 15 ug/m3 LC
3: 12/22/2002 AQS 61131003 1 1 ug/m3 LC
4: 12/25/2002 AQS 61131003 1 23 ug/m3 LC
5: 12/28/2002 AQS 61131003 1 5 ug/m3 LC
6: 12/31/2002 AQS 61131003 1 6 ug/m3 LC
DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1: 57 Woodland-Gibson Road 1 100
2: 57 Woodland-Gibson Road 1 100
3: 4 Woodland-Gibson Road 1 100
4: 74 Woodland-Gibson Road 1 100
5: 21 Woodland-Gibson Road 1 100
6: 25 Woodland-Gibson Road 1 100
AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
1: 88101 PM2.5 - Local Conditions 40900
2: 88101 PM2.5 - Local Conditions 40900
3: 88101 PM2.5 - Local Conditions 40900
4: 88101 PM2.5 - Local Conditions 40900
5: 88101 PM2.5 - Local Conditions 40900
6: 88101 PM2.5 - Local Conditions 40900
CBSA_NAME STATE_CODE STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
COUNTY SITE_LATITUDE SITE_LONGITUDE
1: Yolo 38.66121 -121.7327
2: Yolo 38.66121 -121.7327
3: Yolo 38.66121 -121.7327
4: Yolo 38.66121 -121.7327
5: Yolo 38.66121 -121.7327
6: Yolo 38.66121 -121.7327
Both of the data sets have the same 20 variables. Data_2002 has 15976 observations, while Data_2022 has 57775 observations (about 3.6 times as many).
# Tag each table with its year, then vertically concatenate them
Data_2002$Year <- 2002
Data_2022$Year <- 2022
all_data <- rbind(Data_2002, Data_2022)

# Shorten the column names used by the rest of the analysis
# (e.g. "Daily Mean PM2.5 Concentration" becomes "PM2.5").
# skip_absent = TRUE leaves the table untouched for any name that is not found.
data.table::setnames(
  all_data,
  old = c(
    "Daily Mean PM2.5 Concentration",
    "SITE_LATITUDE",
    "SITE_LONGITUDE",
    "Site Name"
  ),
  new = c("PM2.5", "lat", "lon", "Site_Name"),
  skip_absent = TRUE
)
Leaflet map
library(leaflet)
library(leaflet.extras)

# The map shows where monitors are located, so keep one row per distinct
# coordinate pair and year instead of drawing a marker for every daily
# observation.
site_locations <- unique(
  as.data.frame(all_data)[, c("lat", "lon", "Year")]
)

# Year takes only two values here, so map it with a discrete palette:
# the earlier year is drawn in blue and the later year in red.
color_pal <- colorFactor(
  palette = c("blue", "red"),
  domain = site_locations$Year
)

# Create the leaflet map
map <- leaflet(data = site_locations) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lat = ~lat,
    lng = ~lon,
    radius = 3,
    fillOpacity = 0.8,
    color = ~color_pal(Year) # Colour each marker by its year
  ) %>%
  addLegend(
    title = "Site Location by Year",
    pal = color_pal, # Legend is derived from the same palette as the markers
    values = ~Year,
    opacity = 1, # Opacity is a proportion between 0 and 1
    position = "bottomleft"
  )

# Show the map
map
There are many more locations in 2022 than in 2002. There are many more sites around/near the large coastal cities of San Diego, Los Angeles, and San Francisco.
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(ggplot2)library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
# Parse the month/day/year strings in Date into date values
all_data$Date <- mdy(all_data$Date)

# Histogram of PM2.5 values, using bins 5 units wide
ggplot(data = all_data, mapping = aes(x = PM2.5)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(
    title = "Distribution of PM2.5 Values",
    x = "PM2.5 Value",
    y = "Frequency"
  ) +
  theme_minimal()
# Count implausible PM2.5 readings (negative, or above 500) for each date
summary_data <- all_data %>%
  mutate(Implausible = PM2.5 < 0 | PM2.5 > 500) %>%
  group_by(Date = as.Date(Date)) %>%
  # na.rm = TRUE: a missing PM2.5 value yields an NA flag, which would
  # otherwise turn that whole day's count into NA
  summarise(Count_Implausible = sum(Implausible, na.rm = TRUE))

# Line plot of the daily implausible-value counts over time
ggplot(summary_data, aes(x = Date)) +
  geom_line(aes(y = Count_Implausible, color = "Implausible")) +
  labs(
    title = "Implausible PM2.5 Values Over Time",
    x = "Date",
    y = "Count"
  ) +
  # Only one series is drawn, so only its colour is declared
  scale_color_manual(values = c("Implausible" = "blue")) +
  theme_minimal()
There are 143 implausible data points; they occur largely in the beginning of the data set and at the end. No PM 2.5 value exceeds 500 (the maximum in the summaries above is 302.5), so the implausible values are the negative concentrations, and these occur only in 2022 (the 2002 minimum is 0). This could possibly be explained by more testing sites, more measurements being taken, or issues with instrumentation.
# Load necessary libraries
library(ggplot2)
library(dplyr)

# State level analysis: mean and standard deviation of PM2.5 for each year
state_summary <- summarise(
  group_by(all_data, Year),
  Mean_PM2.5 = mean(PM2.5, na.rm = TRUE),
  SD_PM2.5 = sd(PM2.5, na.rm = TRUE)
)

# Bar chart of the yearly mean PM2.5
ggplot(state_summary, aes(x = as.factor(Year), y = Mean_PM2.5)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Average PM2.5 Levels by Year in California",
    x = "Year",
    y = "Mean PM2.5"
  ) +
  theme_minimal()
# Histogram of all PM2.5 readings, using bins 2 units wide
ggplot(all_data) +
  geom_histogram(
    aes(x = PM2.5),
    binwidth = 2,
    fill = "lightgreen"
  ) +
  labs(title = "PM2.5 Distribution in California", x = "PM2.5 Levels") +
  theme_minimal()
# County level analysis: mean, standard deviation and median PM2.5 per county
county_summary <- summarise(
  group_by(all_data, COUNTY),
  Mean_PM2.5 = mean(PM2.5, na.rm = TRUE),
  SD_PM2.5 = sd(PM2.5, na.rm = TRUE),
  Median_PM2.5 = median(PM2.5, na.rm = TRUE)
)

# One boxplot per county to compare the PM2.5 distributions
ggplot(all_data, aes(x = COUNTY, y = PM2.5)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "PM2.5 Distribution by County",
    x = "County",
    y = "PM2.5 Levels"
  ) +
  theme_minimal() +
  # Rotate the county names so they do not overlap on the x-axis
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Site Level Analysis (Los Angeles County)
la_county_data <- all_data %>%
  filter(COUNTY == "Los Angeles")

# Mean and SD of PM2.5 for each year within LA County
la_summary <- la_county_data %>%
  group_by(Year) %>%
  summarise(
    Mean_PM2.5 = mean(PM2.5, na.rm = TRUE),
    SD_PM2.5 = sd(PM2.5, na.rm = TRUE)
  )

# Define custom colors for LA County and State
la_color <- "red"
state_color <- "blue"

# Combine the county and state summaries into one table labelled by Location.
# Dodging only separates bars drawn by the same layer, so plotting the two
# summaries as separate layers would draw their bars on top of each other.
location_summary <- bind_rows(
  "LA County" = la_summary,
  "State" = state_summary,
  .id = "Location"
)

# Side-by-side bars of mean PM2.5 by year for LA County and the whole state
ggplot(
  location_summary,
  aes(x = as.factor(Year), y = Mean_PM2.5, fill = Location)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Average PM2.5 Levels in LA County and California",
    x = "Year",
    y = "Mean PM2.5"
  ) +
  scale_fill_manual(
    values = c("LA County" = la_color, "State" = state_color)
  ) +
  theme_minimal() +
  guides(fill = guide_legend(title = "Location"))
At the state level, we look at the histogram of PM 2.5 levels in California, and find that the data is distributed with the most counts around 10.
At the county level, we look at the PM 2.5 distribution by county. We find that counties like Placer, Nevada, Trinity, and Siskiyou have some of the highest PM 2.5 levels in the state.
At the LA county level, we look at the average PM 2.5 level in 2002 and 2022 and find that LA county is higher than the state average in both years, but both the state and the county averages are lower in 2022.